/*
 
 Program cleaneneu_web.sas (a Stata do-file despite the .sas suffix) performs
 basic cleaning and consistency checks on ENEU data.

 Kumler, Verhoogen, Frias "Enlisting Employees ..." REStat forthcoming

*/



****************************** housekeeping *************************************;

* From here on every command, including star comments, must end with a semicolon.
#delimit;
* Disable the more pause so the script runs unattended;
set more off;

* Project setup script -- presumably defines the path globals (code, work, tmp)
  used throughout this file, TODO confirm;
do ${code}housekeeping.do;
* NOTE(review): set mem is a manual memory allocation request -- confirm it is
  still needed for the Stata version this script targets;
set mem 8g;
set processors 8;


********************* get geocodes ready **************************;

* Load only the municipality code and its zone from the IMSS/INEGI concordance;
use entmpio_inegi zona using ${work}mpio_imss_inegi_zona, clear;

* Keep one row per municipality code so the lookup is unique on its key;
sort entmpio_inegi;
by entmpio_inegi: drop if _n>1;

* Park the lookup in a temporary file for the zone merge further down;
tempfile d1;
save `d1', replace;


*********************** initial cleaning of eneu datasets ********************;

**transfer files from sas to stata;

*loop over years 1987-2004;

forval yr = 1987/2004
{;

 * Log which survey year is being processed;
 display "year = `yr'";

 * Decompress the SAS file for this year from the work directory into tmp;
 sh gunzip -cf ${work}eneu`yr'_web.sas7bdat.gz > ${tmp}eneu`yr'_web.sas7bdat;
 * Convert the SAS file to a Stata .dta -- st is presumably the Stat/Transfer
   command line tool and -y its overwrite switch, TODO confirm;
 sh st ${tmp}eneu`yr'_web.sas7bdat ${tmp}eneu`yr'_web.dta -y;
 * Remove the uncompressed SAS copy once it has been converted;
 sh /bin/rm -f ${tmp}eneu`yr'_web.sas7bdat;
     
 * Load the converted file for cleaning;
 use ${tmp}eneu`yr'_web, clear; 
 
 * Variables kept for every survey year;
 local keepvars municipi control numviv hogar hogarmud trh last_job edad
	employee_type coworkers hrs_worked hrs_worked_cat pay_type aguinaldo_d
	vacaciones_d utilidades_d imss_d issste_d credito_d medico_d otherben_d factor
	male age married work_lwk work_nopay temp_abs new_job work_lyr fulltime
	flag include_3 include_3a rama city state mnthwage relate_mw
	mnthhour hourwage esco1 periodo trimestre ene tipo p1a1
	dailywage occ numentre distsem_a estrato par area ageb;

 * Extra variables kept only for files up to and including 1994;
 if `yr'<=1994 local keepvars `keepvars' segurovol_d subgrupo_cae89 firmsize89
	p7a_89 p7b_89 p7c_89;

 * Extra variables kept only for files from 1994 onwards;
 if `yr'>=1994 local keepvars `keepvars' sar_d subgrupo_cae94 firmsize94
	p7a_2_94 p7b_94 p7c_94 contract_type;

 * The 1994 file therefore keeps both sets.  t_hij (all years) and l_nac
   (1994 onwards) stay commented out of the list, as before;
 keep `keepvars';
 
 * Standardise the names of the geographic identifiers;
 rename state entidad;
 rename city metro_area;
 rename municipi municipio;

 * The codes are concatenated as text below, so convert both to string;
 tostring municipio entidad, replace;

 * Left-pad with zeros -- entidad to 2 characters, municipio to 3;
 replace entidad = "0" + entidad if length(entidad)==1;
 replace municipio = substr("00", 1, 3-length(municipio)) + municipio
	if inlist(length(municipio), 1, 2);

 * Combined state plus municipality code, the key of the zone lookup;
 gen entmpio_inegi = entidad + municipio;

 * Tabulate the raw age variable for the log, then discard it -- age is used
   from here on;
 tabulate edad;
 drop edad;

 * Employment questions are only asked from age 12.  Missing ages compare as
   larger than any number, so they pass this filter and are handled next;
 keep if age>=12;

 * Drop people with no age who were also not asked the employment questions;
 drop if p1a1==. & age==.;

 * Only adult records (tipo==3) are retained.  A couple of tipo==2 records in
   87 and 88 carry an age but no other employment information, so they go too.
   The assert only reports contradictions (rc0), it does not stop the run;
 tabulate tipo, missing;
 assert tipo==3, rc0;
 drop if tipo!=3;
 drop tipo;
 
 * Record sample sizes after the age and tipo filters.  The counts summarize
   ene, the variable retained by the keep lists above -- no variable called
   eneu exists at this point, so summarizing eneu halts the run.  r(N) is the
   number of nonmissing values of ene in the selected rows;
 qui sum ene;
 local obs_`yr'1 = r(N);
 qui sum ene if work_lwk==1;
 local obs_work_`yr'1 = r(N);
 qui sum ene if include_3a==1;
 local obs_ques_`yr'1 = r(N);
 
 * Note: for some years, >30,000 obs/yr have p1a1==0 -- these do not seem to have
   any empl information, so drop them;
 
 tab p1a1, missing;
 assert p1a1==1 | p1a1==2, r;
 drop if p1a1==0;
 drop p1a1;
 
 * Sample sizes after dropping the p1a1==0 records;
 qui sum ene;
 local obs_`yr'2 = r(N);
 qui sum ene if work_lwk==1;
 local obs_work_`yr'2 = r(N);
 qui sum ene if include_3a==1;
 local obs_ques_`yr'2 = r(N);
 
 * sex indicator should be coded male==0 or male==1, as asserted below;
 * Note: one obs in 1987 has missing sex -- keep for now;
 
 tab male, missing;
 assert male==0 | male==1, r;
 
 *check for relate_mw variable;
 if `yr'<=1993 tab p7c_89 p7b_89 if p7a_89==999999, missing;
 if `yr'>1993 tab p7c_94 p7b_94 if p7a_2_94==999999, missing;
 drop p7b_* p7c_* p7a_*;
 
 *check that hrs_worked matches with the hrs_worked_cat variable;
 tab hrs_worked hrs_worked_cat;
 
 *weights should be >0;
 *127 workers in 1988 have factor=0;
 *314 workers in 1989 have factor=0;
 *314 workers in 1990 have factor=0;
 *172 workers in 1991 have factor=0;
 *215 workers in 1992 have factor=0;
 *848 workers in 1993 have factor=0;
 *24 workers in 1998 have factor=0;

 * Report, describe and then drop records with a non-positive weight;
 assert factor>0,r;                  
 tab factor if factor<=0;
 tab work_lwk if factor<=0;
 tab metro_area if factor<=0;
 tab metro_area if factor<=0 & work_lwk==1;	
 tab entidad if factor<=0;
 tab entidad if factor<=0 & work_lwk==1;
 drop if factor<=0;
 
 * Sample sizes after dropping non-positive weights;
 qui sum ene;
 local obs_`yr'3 = r(N);
 qui sum ene if work_lwk==1;
 local obs_work_`yr'3 = r(N);
 qui sum ene if include_3a==1;
 local obs_ques_`yr'3 = r(N);
 
 * Consistency of the employment items with include_3a.
   Hours and pay type must be zero when include_3a is missing, and nonzero
   when include_3a is 1 for people who are neither starting a new job nor
   reporting work within the last year.  Hours may legitimately be zero
   otherwise.  All asserts use rc0, so contradictions are reported in the log
   without stopping the run;
 local steady "new_job==0 & work_lyr==0";

 tab employee_type include_3a, missing;
 foreach v in hrs_worked_cat hrs_worked pay_type {;
  tab `v' include_3a if `steady', missing;
 };

 assert (employee_type==0 | employee_type==.) if include_3a==., rc0;
 assert !(employee_type==0 | employee_type==.) if include_3a==1, rc0;
 foreach v in hrs_worked hrs_worked_cat pay_type {;
  assert `v'==0 if include_3a==., rc0;
  assert `v'!=0 if include_3a==1 & `steady', rc0;
 };

 * create schooling categories from esco1 (presumably years of schooling,
   TODO confirm).  Categories are mutually exclusive:
     0 = below 6, 1 = 6, 2 = 7-8, 3 = 9, 4 = 10-11, 5 = 12, 6 = 13-15,
     7 = exactly 16, 8 = above 16.
   The top category uses a strict inequality -- with >=16 it overwrote every
   esco1==16 record, so category 7 could never occur.  Missing esco1 stays
   missing;
 gen esco_cat = 0 if esco1<6;
 replace esco_cat = 1 if esco1==6;
 replace esco_cat = 2 if esco1==7 | esco1==8;
 replace esco_cat = 3 if esco1==9;
 replace esco_cat = 4 if esco1==10 | esco1==11;
 replace esco_cat = 5 if esco1==12;
 replace esco_cat = 6 if esco1==13 | esco1==14 | esco1==15;
 replace esco_cat = 7 if esco1==16;
 replace esco_cat = 8 if esco1>16 & esco1~=.;
	
 * Harmonised firm size.
   The firm size question changes during 1994 -- the code below treats
   quarters 1-2 of 1994 and earlier as the old question (firmsize89) and
   quarters 3-4 of 1994 and later as the new one (firmsize94).
   Under the old question a respondent can report a firm size of zero, which
   suggests the respondent is the only worker at the firm.  Under the new
   question zero is not available, so an answer of 1 worker may mean just the
   respondent.  In practice the only workers reporting zero under the old
   question are trabajadores por su cuenta, which we drop.  The harmonised
   variable therefore assumes individuals include themselves in the firm size;

 * The codes follow the new question, so the old codes are shifted down by one:
   1 = 1 worker
   2 = 2-5 workers
   3 = 6-10 workers
   4 = 11-15 workers
   5 = 16-50 workers
   6 = 51-100 workers
   7 = 101-250 workers
   8 = 250+ workers;

 gen byte firmsize=.;

 * Old question -- shift the code, then blank out codes 10 and 11;
 if `yr'<=1994 {;
  replace firmsize = firmsize89 - 1 if (periodo==1994 & trimestre<=2) | periodo<=1993;
  replace firmsize = . if inlist(firmsize89, 10, 11);
  drop firmsize89;
 };

 * New question -- copy the code, then blank out codes 9 and 99;
 if `yr'>=1994 {;
  replace firmsize = firmsize94 if (periodo==1994 & trimestre>=3) | periodo>=1995;
  replace firmsize = . if inlist(firmsize94, 9, 99);
 };
    
 * Five-year age bands.  Band 1 is age above 15 up to 20, band 2 above 20 up
   to 25, and so on to band 10 for above 60 up to 65.  Other ages, including
   missing, get no band;
 qui gen byte age_cat=.;
 forvalues band = 1/10 {;
  local lower = 5*`band' + 10;
  qui replace age_cat = `band' if age>`lower' & age<=`lower'+5;
 };

 * Ten-year bands built by pairing consecutive five-year bands;
 gen byte age_cat2=.;
 forvalues wide = 1/5 {;
  replace age_cat2 = `wide' if inlist(age_cat, 2*`wide'-1, 2*`wide');
 };

 * Proxy for permanent workers, to mirror the fact that IMSS only includes
   permanent workers -- more than 20 reported hours;
 gen perm = (hrs_worked!=. & hrs_worked>20);
 
 ** create IMSS and ISSSTE indicators for the household head;

 * check for duplicates with household id and hh head;

 * below, i am only using control numviv hogar hogarmud for the hh id because
   there is a problem using municipio area and ageb across years. however,
   since i am not trying to link over time here, i add these variables. adding
   municipio area and ageb gets rid of duplicate hh for all years except 1987
   and 1988;

 * however, there are duplicate heads within household prior to 1997.;

 * municipio is written out in full -- the variable was renamed from municipi
   earlier in this loop, so the old spelling only resolves through variable
   abbreviation and breaks when abbreviation is switched off;
 duplicates report control numviv hogar hogarmud municipio area ageb trh trimestre;
 duplicates report control numviv hogar hogarmud municipio area ageb trimestre if par==1;

 * NOTE(review): the household totals below group on control numviv hogar
   hogarmud trimestre only, without municipio area ageb -- confirm this is the
   intended household definition;

 * Person-level flags, defined for heads (par==1) only -- 0 for every head,
   1 when the head reports the benefit;
 gen imss_head=.;
 gen issste_head=.;
 replace imss_head=0 if par==1;
 replace issste_head=0 if par==1;
 replace imss_head=1 if imss_d==1 & par==1;
 replace issste_head=1 if issste_d==1 & par==1;

 * Household totals of the head flags.  The missing option leaves the total
   missing when no member of the household is a head;
 bys control numviv hogar hogarmud trimestre: egen imss_head1 = total(imss_head), missing;
 bys control numviv hogar hogarmud trimestre: egen issste_head1 = total(issste_head), missing;

 * there are some households that do not have a head, causing missing value for
   the IMSS indicator for the househead variable;
 * Rebuild each flag as a household-level 0/1 shared by all members -- 1 when
   any head in the household is covered, missing when there is no head;
 tab imss_head1, m;
 drop imss_head;
 gen imss_head = .;
 replace imss_head = 0 if imss_head1==0;
 replace imss_head = 1 if imss_head1>0 & imss_head1!=.;
 drop imss_head1;

 tab issste_head1, m;
 drop issste_head;
 gen issste_head = .;
 replace issste_head = 0 if issste_head1==0;
 replace issste_head = 1 if issste_head1>0 & issste_head1!=.;
 drop issste_head1;
 
 
 * Describe the cleaned yearly dataset in the log;
 desc;
 
 * Save the cleaned year to tmp.  gsave is not a built-in Stata command -- it
   presumably writes a gzipped .dta, since the stacking step below gunzips
   files named with the _web_2.dta.gz ending, TODO confirm;
 gsave ${tmp}eneu`yr'_web_2, replace;
 * Remove the converted raw .dta now that the cleaned copy is saved;
 sh /bin/rm ${tmp}eneu`yr'_web.dta;
 
};


***************** stack yearly datasets ***********;

*** stack;

* Start from an empty dataset before stacking;
clear;
* NOTE(review): manual memory request -- confirm it is still needed for the
  Stata version this script targets;
set mem 5g;

*append yearly data sets to create large dataset;

* 1987 is the base file.  guse is not a built-in Stata command -- presumably the
  reading counterpart of gsave, TODO confirm;
guse ${tmp}eneu1987_web_2, clear;
     
* For each later year -- unzip the cleaned file, append it, delete the unzipped
  copy;
forval yr = 1988/2004
{;
 display "year = `yr'";
 sh gunzip -cf ${tmp}eneu`yr'_web_2.dta.gz > ${tmp}eneu`yr'_web_2.dta;
 append using ${tmp}eneu`yr'_web_2;
 sh /bin/rm -f ${tmp}eneu`yr'_web_2.dta;
};

* Shrink storage types of the stacked data where possible;
compress;

***************** create individual identifiers ***********;

*create interview number;

* Note: the interview number is not recorded before 1994, only the panel
  (distsem_a) and the year and quarter, so it is rebuilt here for all years
  from the rotation pattern.  Index quarters consecutively with 1987 q1 = 1,
  that is q = 4*(periodo-1987) + trimestre.  Panel 3 has interview 1 in q = 1,
  panel 4 in q = 2, panel 5 in q = 3, panel 1 in q = 4 and panel 2 in q = 5.
  The cycle repeats every five quarters, and each panel moves on to its next
  interview one quarter later.  Hence
      int_no = mod(q - panel + 7, 5) + 1
  which reproduces the full year by quarter by panel table for 1987 q1
  through 2004 q4.  The guards restrict the assignment to panels 1 to 5 and to
  that period, leaving int_no missing everywhere else;

gen byte int_no = .;
replace int_no = mod(4*(periodo-1987) + trimestre - real(distsem_a) + 7, 5) + 1
	if inlist(distsem_a,"1","2","3","4","5")
	 & inrange(periodo,1987,2004) & periodo==floor(periodo)
	 & inlist(trimestre,1,2,3,4);

* Compare with the recorded interview number and check coverage by year;
tab int_no numentre, missing;
tab periodo int_no, missing;
		 
*create person id;

* One id per combination of the household identifiers and the person line
  number trh.  Note this key has no time dimension;
egen pers_id = group(control numviv hogar hogarmud trh);

* A person id should appear at most once per year and quarter.  duplicates tag
  stores the number of OTHER records sharing the key (0 when unique, 1 for a
  pair, 2 for a triple, and so on), so every record with a positive count is
  dropped.  Testing for exactly 1 would remove pairs but leave triples and
  larger groups in the data;
duplicates tag pers_id periodo trimestre, gen(duplicate);
drop if duplicate>0;

* Build unique_id, which links the successive interviews of one respondent.
  pers_id alone is not enough because it has no time dimension, so records
  are chained only when the interview number and the calendar quarter advance
  together.  Sorting puts the records of each pers_id in chronological order,
  so [_n-1] is the preceding record of the same pers_id when there is one;
sort pers_id periodo trimestre int_no;

* Candidate ids, one variable per interview number -- each is nonmissing only
  on records with that int_no.  unique_id starts from first interviews;
egen unique_id=group(pers_id periodo trimestre) if int_no==1;
egen unique_id2=group(pers_id periodo trimestre) if int_no==2;
egen unique_id3=group(pers_id periodo trimestre) if int_no==3;
egen unique_id4=group(pers_id periodo trimestre) if int_no==4;
egen unique_id5=group(pers_id periodo trimestre) if int_no==5;

* Carry the id to the next interview -- same pers_id, int_no up by one, and
  the following quarter (including q4 to q1 of the next year);
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+1) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+1) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==4)) &
	unique_id==.;

* These 3 commands account for missing quarters -- the same link across gaps of
  two, three and four quarters with int_no up by the same amount;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+2) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+2) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==3) | 
	((periodo==periodo[_n-1]+1) & trimestre==2 & trimestre[_n-1]==4)) &
	unique_id==.;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+3) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+3) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==2) | 
	((periodo==periodo[_n-1]+1) & trimestre==2 & trimestre[_n-1]==3) | 
	((periodo==periodo[_n-1]+1) & trimestre==3 & trimestre[_n-1]==4)) &
	unique_id==.;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+4) & 
	(((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==1) | 
	((periodo==periodo[_n-1]+1) & trimestre==2 & trimestre[_n-1]==2) | 
	((periodo==periodo[_n-1]+1) & trimestre==3 & trimestre[_n-1]==3) | 
	((periodo==periodo[_n-1]+1) & trimestre==4 & trimestre[_n-1]==4)) &
	unique_id==.;
	
* Largest id handed out so far, used as an offset so new ids do not collide;
sum unique_id;
scalar max_id=r(max);

* Records still without an id that are second interviews start new chains.
  unique_id2 is missing on all other records, so those stay missing here;
replace unique_id=unique_id2+max_id if unique_id==.;

* Chain forward from the new second-interview starts, one quarter ahead;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+1) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+1) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==4)) &
	unique_id==.;

* These 2 commands account for missing quarters -- gaps of two and three;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+2) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+2) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==3) | 
	((periodo==periodo[_n-1]+1) & trimestre==2 & trimestre[_n-1]==4)) &
	unique_id==.;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+3) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+3) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==2) | 
	((periodo==periodo[_n-1]+1) & trimestre==2 & trimestre[_n-1]==3) | 
	((periodo==periodo[_n-1]+1) & trimestre==3 & trimestre[_n-1]==4)) &
	unique_id==.;


* Refresh the offset, then start new chains at third interviews;
sum unique_id;
scalar max_id=r(max);

replace unique_id=unique_id3+max_id if unique_id==.;

replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+1) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+1) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==4)) &
	unique_id==.;
* This command accounts for missing quarters -- a gap of two;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+2) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+2) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==3) | 
	((periodo==periodo[_n-1]+1) & trimestre==2 & trimestre[_n-1]==4)) &
	unique_id==.;

* Refresh the offset, then start new chains at fourth interviews.  Only a
  one-quarter link is possible from interview 4;
sum unique_id;
scalar max_id=r(max);

replace unique_id=unique_id4+max_id if unique_id==.;

replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+1) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+1) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==4)) &
	unique_id==.;

* Refresh the offset, then give remaining fifth interviews their own ids;
sum unique_id;
scalar max_id=r(max);

replace unique_id=unique_id5+max_id if unique_id==.;

*this should not replace any observations!;
replace unique_id=unique_id[_n-1] if pers_id==pers_id[_n-1] & 
	(int_no==int_no[_n-1]+1) & 
	((periodo==periodo[_n-1] & trimestre==trimestre[_n-1]+1) | 
	((periodo==periodo[_n-1]+1) & trimestre==1 & trimestre[_n-1]==4)) &
	unique_id==.;

* Final check of the id range and the number of records with an id;
sum unique_id;
	
*count number of quarters in data;
bys unique_id: egen qtr_in=count(unique_id);

tab qtr_in, m;

sort unique_id int_no;

* Flag respondents whose sex or age is inconsistent across their interviews;

* Mean of male within respondent -- anything other than exactly 0 or 1 means
  the recorded sex changes between interviews or is missing throughout;
by unique_id: egen male_m=mean(male);
* Age at the previous interview of the same respondent;
by unique_id: gen age_lag1 = age[_n-1];

* age_diff is 0 when age equals the previous age or is at most one year
  higher, 1 otherwise, and missing on the first record of each respondent.
  age_lag1 is written out in full so the code does not depend on variable
  abbreviation being switched on;
gen byte age_diff = .;
replace age_diff = 0 if inrange(age,age_lag1,age_lag1+1) & age_lag1!=.;
replace age_diff = 1 if !inrange(age,age_lag1,age_lag1+1) & age_lag1!=.;

tab age_diff, m;

* Respondent-level indicator -- 1 when any interview shows an age inconsistency;
by unique_id: egen byte age_diff1 = sum(age_diff);
gen byte age_diff_total = .;
replace age_diff_total = 1 if age_diff1>0 & age_diff1!=.;
replace age_diff_total = 0 if age_diff1==0 | age_diff1==.;

tab age_diff_total;
tab age_diff age_diff_total;

* fe_flag is 1 for flagged respondents and missing otherwise;
gen byte fe_flag=.;
replace fe_flag=1 if male_m!=0 & male_m!=1;
replace fe_flag=1 if age_diff_total>0;

tab fe_flag;

drop male_m age_diff_total age_diff1 age_diff age_lag1;

******************* create wage variables ************************;

** merge in geographic concordance to match to min wage zones;

sort entmpio_inegi;

* Old-style match merge of the one-row-per-municipality lookup saved earlier;
merge entmpio_inegi using `d1';

tab _merge;

* _merge==2 if ENTMPIO_INEGI code has an IMSS code but is not in ENE;

drop if _merge==2;
drop _merge;

* Note from Todd: _merge==1 for DF (ENTMPIO_INEGI=09***), which is
  because DF does not merge, so manually assign to zona A;

* assign zona based on state if missing municipio, for states completely in one zone;

replace zona="A" if zona=="" & inlist(substr(entmpio_inegi, 1, 2),"02","03","09");
replace zona="C" if zona=="" & (inlist(substr(entmpio_inegi,1,2),"01","04","05","06","07","10","11")
   | inlist(substr(entmpio_inegi,1,2),"13","16","17","18","20","21","22")
   |  inlist(substr(entmpio_inegi,1,2),"23","24","25","27","29","31","32"));

* assign remaining obs based on zone that predominates in state.
  The municipality codes are grouped in parentheses because & binds more
  tightly than | -- without them only the first code is restricted to records
  with an empty zona and the other codes overwrite a zone that has already
  been assigned.  The variable name is also written out in full rather than
  abbreviated;

replace zona = "C" if zona=="" & (entmpio_inegi=="12000" | entmpio_inegi=="30319");
replace zona = "B" if zona=="" & (entmpio_inegi=="14000" | entmpio_inegi=="19000" | entmpio_inegi=="26000");
replace zona = "A" if zona=="" & (entmpio_inegi=="15000" | entmpio_inegi=="15157");

tab zona, missing;

**** merge in minimum wages;

* Rename the time variables -- presumably to match the key names in the
  minimum wage file, TODO confirm;
rename periodo year;
rename trimestre qtr;

sort year qtr zona;

* Old-style match merge on year, quarter and zone.  The uniqusing option
  requires the key to be unique in the using file, and sort orders both
  datasets on the key before matching;
merge year qtr zona using ${work}minwages_wtopcodes.dta, sort uniqusing;

*merge==2 for years before and after ENEU data;
tab _merge;
tab year if _merge==2;
drop if _merge==2;

* merge==1 for individuals with no zona info -- there should not be any after cleaning above;
tab zona if _merge==1, missing;

drop _merge;

*** generate basic wage variables;

* hsal is the hourly wage from here on;
rename hourwage hsal;

count if hsal==.; * should be zero;

* generate daily wage if wage is defined relative to min wage.  Only applied
  when the hourly wage is missing and relate_mw is below 99 -- values of 99 or
  more are presumably missing or not-applicable codes, TODO confirm;

replace dailywage=salmin*relate_mw if hsal==. & relate_mw<99;

* sal is the daily wage from here on;
rename dailywage sal;

* Note (7/13/11): the following two commands were not in Todds version
  of cleanene_web.do, which I copied over from the Compliance project
  today;

* use daily wage information to infer hourly wage, at 8 hours per day;
replace hsal = sal/8 if hsal==. & sal~=.;
* use hourly wage information to infer daily wage, at 8 hours per day;
replace sal = hsal*8 if hsal~=. & sal==.;

**** real wage variables under alternative censoring rules;

* rhsal1 and rsal1 -- hourly and daily wage deflated by cpi, with no
  additional censoring;
gen rhsal1 = hsal/(cpi/100);
gen rsal1 = sal/(cpi/100);

* Each censored variant starts from rsal1 and is then raised to a floor and
  lowered to a ceiling:
    rsal2 -- top- and bottom-codes for each year (salmin and topsal, deflated)
    rsal3 -- bottom-codes by zone and top-codes for the entire period
    rsal4 -- global (not by zone) bottom- and top-codes for the entire period
  Records with a missing rsal1 are left missing;
local floor2 "salmin/(cpi/100)";
local ceil2  "topsal/(cpi/100)";
local floor3 "rsalmin91_byzone";
local ceil3  "rtopsalmin_byzone";
local floor4 "rsalmin91_zonea";
local ceil4  "rtopsalmin_zonec";

forvalues v = 2/4 {;
 gen rsal`v'=rsal1;
 replace rsal`v' = `floor`v'' if rsal1<`floor`v'' & rsal1!=.;
 replace rsal`v' = `ceil`v'' if rsal1>`ceil`v'' & rsal1!=.;
};
  
* Winsorize the real hourly and daily wage within each year and quarter.
  Suffix 5 clips at the 5th and 95th percentiles (rhsal5, rsal5) and suffix 6
  at the 10th and 90th (rhsal6, rsal6).  The percentile variables are only
  needed temporarily and are dropped at the end of each pass;
forvalues s = 5/6 {;
 local lo = cond(`s'==5, 5, 10);
 local hi = 100 - `lo';

 * start from the uncensored real wages;
 foreach w in rhsal rsal {;
  gen `w'`s'=`w'1;
 };

 * cut-offs by year and quarter;
 foreach w in rhsal rsal {;
  bys year qtr: egen `w'`s'_p`hi' = pctile(`w'1), p(`hi');
  bys year qtr: egen `w'`s'_p`lo' = pctile(`w'1), p(`lo');
 };

 * pull values beyond the cut-offs back to the cut-off;
 foreach w in rhsal rsal {;
  replace `w'`s' = `w'`s'_p`hi' if `w'1>`w'`s'_p`hi' & `w'1!=.;
  replace `w'`s' = `w'`s'_p`lo' if `w'1<`w'`s'_p`lo' & `w'1!=.;
 };

 drop rhsal`s'_p`hi' rhsal`s'_p`lo' rsal`s'_p`hi' rsal`s'_p`lo';
};

** work status indicators;

* condition shared by all work_ flags: hours worked between 1 and 98,
  pay_type other than 9, and a nonmissing nonzero daily wage;
local paidwork "inrange(hrs_worked,1,98) & pay_type!=9 & sal!=. & sal!=0";

* work is 1 for every observation;
qui gen byte work = 1;

* the remaining flags add fulltime, imss_d and issste_d requirements;
qui gen byte work_all = (`paidwork');
qui gen byte work_ft = (`paidwork' & fulltime==1);
qui gen byte work_imss = (`paidwork' & imss_d==1);
qui gen byte work_issste = (`paidwork' & issste_d==1);
qui gen byte work_ft_imss = (`paidwork' & fulltime==1 & imss_d==1);
qui gen byte work_ft_issste = (`paidwork' & fulltime==1 & issste_d==1);

* restrict the file to the analysis variables. The list is grouped as
  identifiers and survey design, time and geography, demographics, job
  characteristics, wages, and work indicators. keep does not change the
  order of variables in the dataset;

keep pers_id unique_id int_no qtr_in numentre par ene factor fe_flag
 year qtr entmpio_inegi zona metro_area
 male age age_cat* married esco* imss_head issste_head
 last_job employee_type contract_type hrs_worked pay_type fulltime imss_d
 work_lwk work_nopay temp_abs new_job work_lyr occ rama subgrupo* firmsize*
 sal hsal rsal* rhsal* mnthwage mnthhour relate_mw salmin salmindf cpi
 work work_all work_ft work_imss work_issste work_ft_imss work_ft_issste;

compress;

* gsave is not a built-in command -- presumably a save wrapper that
  writes a compressed file, confirm;
gsave ${work}eneu19872004_web_allind, replace;



***************** select industries *********************;

* the subgrupo definition changes during 1994. The code below uses
  subgrupo_cae89 through 1994 quarter 2 and subgrupo_cae94 from 1994
  quarter 3 onward. NOTE(review) an earlier version of this comment
  placed the change between quarters 3 and 4 -- confirm which is right;
*assign them the same variable name for now;

guse ${work}eneu19872004_web_allind;

gen int subgrupo=.;

replace subgrupo=subgrupo_cae89 if year<=1993 | (year==1994 & qtr<=2);
replace subgrupo=subgrupo_cae94 if year>=1995 | (year==1994 & qtr>=3);

* create one-digit industry codes. Each industry has one range of
  subgrupo codes under the old definition and one under the new
  definition. Every range has an explicit upper bound because Stata
  treats a missing value as larger than any number, so an open-ended
  test such as subgrupo>=6101 would also be true for missing subgrupo;

local old_def "(year<1994 | (year==1994 & qtr<=2))";
local new_def "(year>1994 | (year==1994 & qtr>2))";

gen ind=.;
replace ind=0 if inrange(subgrupo,101,449) & `new_def';
replace ind=0 if inrange(subgrupo,101,419) & `old_def';
replace ind=1 if inrange(subgrupo,501,1039) & `new_def';
replace ind=1 if inrange(subgrupo,501,1029) & `old_def';
replace ind=2 if inrange(subgrupo,1101,5949) & `new_def';
replace ind=2 if inrange(subgrupo,1101,5939) & `old_def';
replace ind=4 if inrange(subgrupo,6001,6017) & `new_def';
replace ind=4 if inrange(subgrupo,6001,6016) & `old_def';
replace ind=5 if inrange(subgrupo,6101,6121) & `new_def';
* old-definition range for ind 5 is capped at 6199, the end of the 61xx
  block, because codes from 6201 belong to ind 6. NOTE(review) replace
  6199 with the exact last old-definition code if it is known;
replace ind=5 if inrange(subgrupo,6101,6199) & `old_def';
replace ind=6 if inrange(subgrupo,6201,6323) & `new_def';
replace ind=6 if inrange(subgrupo,6201,6323) & `old_def';
replace ind=7 if inrange(subgrupo,6401,6531) & `new_def';
replace ind=7 if inrange(subgrupo,6401,6531) & `old_def';
replace ind=8 if inrange(subgrupo,6601,6833) & `new_def';
replace ind=8 if inrange(subgrupo,6601,6834) & `old_def';
replace ind=9 if inrange(subgrupo,6901,7401) & `new_def';
replace ind=9 if inrange(subgrupo,6901,7321) & `old_def';

* check the coding -- missing ind marks codes outside every range;
tab ind, missing;
tab year ind, missing;

* make rama numeric so that the range tests below can be applied;
destring rama, replace;

tabulate rama year, m;

* keep the selected rama codes (11-32, 35-59, 60 and 62-63) and also
  include Social Security employees (subgrupo 7321), to do check;

keep if subgrupo==7321
 | inrange(rama,11,32)
 | inrange(rama,35,59)
 | rama==60
 | inrange(rama,62,63);

gsave ${work}eneu19872004_web, replace;

* remove the intermediate per-year files from the temporary directory;
sh /bin/rm ${tmp}eneu????_web_2.dta.gz;

log close;


